########################################################
# Project:    Commission Communication
# Task:       Plot understandability of Comm PRs
# Author:     Christian Rauh (25.07.2022)
########################################################

# Packages ####
library(tidyverse) # 1.3.0
library(Hmisc) # 4.4-1
library(cowplot) # 1.1.0


# The data ####

# Shared final step for every corpus: add the verb-to-noun ratio and keep
# only the common set of columns so the corpora can be stacked row-wise.
# `texts` must already carry a `type` column identifying the text source.
finalise_corpus <- function(texts) {
  texts %>%
    mutate(verbal = n_verb / n_noun) %>%
    select(year, flesch, familiarity, nominal, verbal, type,
           meanSentenceLength, n_sentence, n_noun, n_verb, ntoken)
}

# European Commission press releases
# Disabled option: filter(row_number() > 5) # no texts in first 5 from 1985
comm <- read_rds("./Data/PR-Comm_Language.Rds") %>%
  mutate(type = "Comm press releases") %>%
  finalise_corpus()

# UK government press releases
uk <- read_rds("./Data/PR-UK_Language.Rds") %>%
  mutate(type = "UK press releases") %>%
  finalise_corpus()

# Irish government press releases
ire <- read_rds("./Data/PR-IRE_Language.Rds") %>%
  mutate(type = "IRE press releases") %>%
  finalise_corpus()

# Newspaper texts: the `newspaper` column decides between the two labels
# (everything that is not "Broadsheet" is labelled as tabloid)
news <- read_rds("./Data/Comp-NewsBNC_Language.Rds") %>%
  mutate(type = ifelse(newspaper == "Broadsheet", "Broadsheets NPs", "Tabloid NPs")) %>%
  finalise_corpus()

# Political science abstracts
polsci <- read_rds("./Data/Comp-PolSci_Language.Rds") %>%
  mutate(type = "PolSci abstracts") %>%
  finalise_corpus()

# One long data set with all text types
comb <- rbind(comm, uk, ire, news, polsci)


# Filter faulty texts ####
# Scraping errors, faulty htmls, tabulated tables, missing punctuation etc.

# Keep only texts with plausible linguistic properties; all conditions must
# hold at the same time (rows where a condition evaluates to NA are dropped).
# NOTE(review): `> 1` on the noun/verb counts keeps texts with at least TWO
# nouns and TWO verbs. If "at least one" is the intended rule, this should be
# `>= 1` -- confirm before changing, as it alters the sample sizes.
df <- filter(
  comb,
  meanSentenceLength > 3,   # average sentence length of 3 tokens or fewer is implausible
  meanSentenceLength < 150, # average sentence length of 150 tokens or more is implausible
  n_noun > 1,               # more than one noun
  n_verb > 1,               # more than one verb
  ntoken >= 5               # at least 5 tokens
)

# test <- df %>%  filter(flesch < 0, type == "PolSci abstracts") 
# table(test$type)
# hist(test$meanSentenceLength)
# hist(df$meanSentenceLength)


# The plots ####

# Plotting vars
# Text type as a factor; the level order fixes the order in which the text
# types appear on the discrete axis of the cross-sectional panels.
df$type2 <- factor(df$type, levels = c("Tabloid NPs",
                                       "Broadsheets NPs",
                                       "PolSci abstracts",
                                       "UK press releases",
                                       "IRE press releases",
                                       "Comm press releases"))

# Logical flags. The comparisons already yield logical vectors, so no
# ifelse(..., T, F) wrapper is needed (T/F are ordinary, reassignable variables).
df$com <- df$type == "Comm press releases"        # Commission press release?
df$pr <- str_detect(df$type, "press releases")    # any press release?

# Comparison group used for colouring and facetting:
# "A" = Commission, "B" = UK/IRE press releases, "C" = all other text types
df$cat <- ifelse(df$com, "A", ifelse(str_detect(df$type, "UK|IRE"), "B", "C"))

# Values for the manual scales in the plots
colvals <- c("black", "grey25", "grey50") # one colour per group A, B, C
sizevals <- c(5, 10)
alphavals <- c(.4, 1)                     # com == FALSE / com == TRUE


# Reading ease

# Upper panel: mean reading ease score per text type, one facet row per
# comparison group (A/B/C); facet strips are hidden. Point size is fixed,
# no size aesthetic is mapped.
re.cross <- ggplot(df, aes(x = type2, y = flesch, color = cat, alpha = com)) +
  # Alternative with bootstrapped CIs (needs Hmisc):
  # stat_summary(geom = "pointrange", fun.data = "mean_cl_boot")+
  stat_summary(geom = "point", fun = "mean", size = 5) +
  scale_alpha_manual(values = alphavals) +
  scale_color_manual(values = colvals) +
  labs(title = "Easy to read?",
       x = "",
       y = " \nAverage Flesch/Kincaid\nreading ease score") +
  facet_wrap(. ~ cat, scales = "free_y", ncol = 1) +
  # Flipped: text types run down the vertical axis, scores along the horizontal
  coord_flip(ylim = c(10, 70)) +
  theme(legend.position = "none",
        text = element_text(family = "serif"),
        plot.title = element_text(hjust = .5, size = 20, face = "italic"),
        axis.text = element_text(color = "black", size = 12),
        axis.text.y = element_text(),
        axis.title = element_text(size = 14),
        strip.background = element_blank(),
        strip.text = element_blank(),
        panel.grid = element_line(size = 1.5))

# Lower panel: Commission press releases over time (years before 2021),
# individual texts as faint jittered dots plus the annual mean as a line.
# Reference lines mark the overall means of the comparison corpora; they use
# na.rm = TRUE so that a missing score cannot turn a benchmark into NA, in
# line with stat_summary(), which drops non-finite values before averaging.
re.temp <- ggplot(df[df$type == "Comm press releases" & df$year < 2021, ],
                  aes(x = year, y = flesch)) +
  geom_jitter(width = .3, color = colvals[1], alpha = .03, size = .4) +
  # National government press releases (group B colour)
  geom_hline(yintercept = c(mean(df$flesch[df$type == "UK press releases"], na.rm = TRUE),
                            mean(df$flesch[df$type == "IRE press releases"], na.rm = TRUE)),
             color = colvals[2], alpha = alphavals[1], size = 1.5) +
  # Academic abstracts and newspapers (group C colour)
  geom_hline(yintercept = c(mean(df$flesch[df$type == "PolSci abstracts"], na.rm = TRUE),
                            mean(df$flesch[df$type == "Broadsheets NPs"], na.rm = TRUE),
                            mean(df$flesch[df$type == "Tabloid NPs"], na.rm = TRUE)),
             color = colvals[3], alpha = alphavals[1], size = 1.5) +
  # stat_summary(geom = "pointrange", fun.data = "mean_cl_boot", color = colvals[1], shape = 20)+
  stat_summary(geom = "line", fun = "mean", color = colvals[1], size = 1.5) +
  scale_x_reverse(breaks = seq(1985, 2020, 5)) +
  labs(x = "", y = "") +
  # Same score range as the upper panel so both panels share one value axis
  coord_flip(ylim = c(10, 70)) +
  theme(legend.position = "none",
        text = element_text(family = "serif"),
        plot.title = element_text(hjust = .5, size = 20, face = "bold"),
        axis.text = element_text(color = "black", size = 12),
        axis.text.y = element_text(),
        axis.title = element_text(size = 14),
        panel.grid = element_line(size = 1.5))

# Stack both panels (60% / 40% of the height) and show the result
re.pl <- plot_grid(re.cross, re.temp, ncol = 1, align = "v", rel_heights = c(.6, .4))
re.pl


# Familiarity

# Upper panel: mean word familiarity per text type, one facet row per
# comparison group (A/B/C); facet strips are hidden. Point size is fixed,
# no size aesthetic is mapped.
fam.cross <- ggplot(df, aes(x = type2, y = familiarity, color = cat, alpha = com)) +
  # Alternative with bootstrapped CIs (needs Hmisc):
  # stat_summary(geom = "pointrange", fun.data = "mean_cl_boot")+
  stat_summary(geom = "point", fun = "mean", size = 5) +
  scale_alpha_manual(values = alphavals) +
  scale_color_manual(values = colvals) +
  labs(title = "Familiar words?",
       x = "",
       y = " \nAverage Google Books frequency\nof the words used") +
  facet_wrap(. ~ cat, scales = "free_y", ncol = 1) +
  # Flipped: text types run down the vertical axis, values along the horizontal
  coord_flip(ylim = c(0.00235, 0.003)) +
  theme(legend.position = "none",
        text = element_text(family = "serif"),
        plot.title = element_text(hjust = .5, size = 20, face = "italic"),
        axis.text = element_text(color = "black", size = 12),
        axis.text.y = element_text(),
        axis.title = element_text(size = 14),
        strip.background = element_blank(),
        strip.text = element_blank(),
        panel.grid = element_line(size = 1.5))

# Lower panel: Commission press releases over time (years before 2021),
# individual texts as faint jittered dots plus the annual mean as a line.
# Reference lines mark the overall means of the comparison corpora; they use
# na.rm = TRUE so that a missing value cannot turn a benchmark into NA, in
# line with stat_summary(), which drops non-finite values before averaging.
fam.temp <- ggplot(df[df$type == "Comm press releases" & df$year < 2021, ],
                   aes(x = year, y = familiarity)) +
  geom_jitter(width = .3, color = colvals[1], alpha = .03, size = .4) +
  # National government press releases (group B colour)
  geom_hline(yintercept = c(mean(df$familiarity[df$type == "UK press releases"], na.rm = TRUE),
                            mean(df$familiarity[df$type == "IRE press releases"], na.rm = TRUE)),
             color = colvals[2], alpha = alphavals[1], size = 1.5) +
  # Academic abstracts and newspapers (group C colour)
  geom_hline(yintercept = c(mean(df$familiarity[df$type == "PolSci abstracts"], na.rm = TRUE),
                            mean(df$familiarity[df$type == "Broadsheets NPs"], na.rm = TRUE),
                            mean(df$familiarity[df$type == "Tabloid NPs"], na.rm = TRUE)),
             color = colvals[3], alpha = alphavals[1], size = 1.5) +
  # stat_summary(geom = "linerange", fun.data = "mean_cl_boot", color = colvals[1], shape = 20)+
  stat_summary(geom = "line", fun = "mean", color = colvals[1], size = 1.5) +
  scale_x_reverse(breaks = seq(1985, 2020, 5)) +
  labs(x = "", y = "") +
  # Same value range as the upper panel so both panels share one value axis
  coord_flip(ylim = c(0.00235, 0.003)) +
  theme(legend.position = "none",
        text = element_text(family = "serif"),
        plot.title = element_text(hjust = .5, size = 20, face = "bold"),
        axis.text = element_text(color = "black", size = 12),
        axis.text.y = element_text(),
        axis.title = element_text(size = 14),
        panel.grid = element_line(size = 1.5))

# Stack both panels (60% / 40% of the height) and show the result
fam.pl <- plot_grid(fam.cross, fam.temp, ncol = 1, align = "v", rel_heights = c(.6, .4))
fam.pl



# Verbal style

# Upper panel: mean verb-to-noun ratio per text type, one facet row per
# comparison group (A/B/C); facet strips are hidden. Point size is fixed,
# no size aesthetic is mapped.
verb.cross <- ggplot(df, aes(x = type2, y = verbal, color = cat, alpha = com)) +
  # Alternative with bootstrapped CIs (needs Hmisc):
  # stat_summary(geom = "pointrange", fun.data = "mean_cl_boot")+
  stat_summary(geom = "point", fun = "mean", size = 5) +
  scale_alpha_manual(values = alphavals) +
  scale_color_manual(values = colvals) +
  labs(title = "Focus on action?",
       x = "",
       y = " \nRatio of verbs\nto nouns") +
  facet_wrap(. ~ cat, scales = "free_y", ncol = 1) +
  # Flipped: text types run down the vertical axis, ratios along the horizontal
  coord_flip(ylim = c(.25, 1)) +
  theme(legend.position = "none",
        text = element_text(family = "serif"),
        plot.title = element_text(hjust = .5, size = 20, face = "italic"),
        axis.text = element_text(color = "black", size = 12),
        axis.text.y = element_text(),
        axis.title = element_text(size = 14),
        strip.background = element_blank(),
        strip.text = element_blank(),
        panel.grid = element_line(size = 1.5))

# Lower panel: Commission press releases over time (years before 2021),
# individual texts as faint jittered dots plus the annual mean as a line.
# Reference lines mark the overall means of the comparison corpora; they use
# na.rm = TRUE so that a missing ratio cannot turn a benchmark into NA, in
# line with stat_summary(), which drops non-finite values before averaging.
verb.temp <- ggplot(df[df$type == "Comm press releases" & df$year < 2021, ],
                    aes(x = year, y = verbal)) +
  geom_jitter(width = .3, color = colvals[1], alpha = .03, size = .4) +
  # National government press releases (group B colour)
  geom_hline(yintercept = c(mean(df$verbal[df$type == "UK press releases"], na.rm = TRUE),
                            mean(df$verbal[df$type == "IRE press releases"], na.rm = TRUE)),
             color = colvals[2], alpha = alphavals[1], size = 1.5) +
  # Academic abstracts and newspapers (group C colour)
  geom_hline(yintercept = c(mean(df$verbal[df$type == "PolSci abstracts"], na.rm = TRUE),
                            mean(df$verbal[df$type == "Broadsheets NPs"], na.rm = TRUE),
                            mean(df$verbal[df$type == "Tabloid NPs"], na.rm = TRUE)),
             color = colvals[3], alpha = alphavals[1], size = 1.5) +
  # stat_summary(geom = "pointrange", fun.data = "mean_cl_boot", color = colvals[1], shape = 20)+
  stat_summary(geom = "line", fun = "mean", color = colvals[1], size = 1.5) +
  scale_x_reverse(breaks = seq(1985, 2020, 5)) +
  labs(x = "", y = "") +
  # Same ratio range as the upper panel so both panels share one value axis
  coord_flip(ylim = c(.25, 1)) +
  theme(legend.position = "none",
        text = element_text(family = "serif"),
        plot.title = element_text(hjust = .5, size = 20, face = "bold"),
        axis.text = element_text(color = "black", size = 12),
        axis.text.y = element_text(),
        axis.title = element_text(size = 14),
        panel.grid = element_line(size = 1.5))

# Stack both panels (60% / 40% of the height) and show the result
verb.pl <- plot_grid(verb.cross, verb.temp, ncol = 1, align = "v", rel_heights = c(.6, .4))
verb.pl

# test <- df[df$verbal == .5, ]
# illu <- data.frame(x = sample(10, 1000, replace = T),
#                    y = sample(10, 1000, replace = T)) %>% 
#   mutate(ratio = x/y)
# 
# plot(density(illu$ratio))


# Combined plot

# The three indicator columns (reading ease, familiarity, verbal style) side by side
plot_row <- plot_grid(re.pl, fam.pl, verb.pl, ncol = 3)

# Headline drawn on an empty canvas; the left margin of 120 shifts the
# centred label to the right.
title <- ggdraw() +
  draw_label(
    label = " \nCommunicating to the public...\nThe language of European Commission press releases in perspective\n",
    hjust = .5,
    size = 22,
    fontfamily = "serif",
    fontface = "bold"
  ) +
  theme(plot.margin = margin(t = 0, r = 0, b = 0, l = 120))

# Headline on top of the panel row
plot_row_t <- plot_grid(title, plot_row, ncol = 1, rel_heights = c(0.15, 1))

# Number of texts per type after filtering.
# NOTE(review): the counts in the source note below are hard-coded;
# presumably they were taken from this table -- re-check when the data change.
table(df$type)

# Titled figure plus a source note underneath
fin <- add_sub(
  plot = plot_row_t,
  label = "Text data: European Commission (45,045 press releases, RAPID); UK (85,399 press releases, gov.uk); IRE (6,671 press releases, gov.ie); Tabloids (22,160 paragraphs, BNC); Broadsheets (57,765 paragraphs, BNC),\nPolitical Science (2,332 abstracts, journal websites).  Author: @ChRauh / www.christian-rauh.eu",
  x = 0.1,
  hjust = 0,
  size = 10,
  fontfamily = "serif"
)

# ggdraw(fin)

# Export.
# NOTE(review): this writes `plot_row` (panels only); the titled and annotated
# `fin` is built above but not saved -- presumably intended for the
# publication figure, confirm.
ggsave(filename = "./Plots/Figure2_Color_greyscale.png", plot = plot_row,
       width = 36, height = 24, units = "cm")
